# install.packages(renv)
renv::init()
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
This project already has a lockfile. What would you like to do?
1: Restore the project from the lockfile.
2: Discard the lockfile and re-initialize the project.
3: Activate the project without snapshotting or installing any packages.
4: Abort project initialization.
1
* Restoring project ...
* The library is already synchronized with the lockfile.
renv::install("reticulate")
Retrieving 'https://packagemanager.rstudio.com/all/__linux__/focal/latest/src/contrib/reticulate_1.22.tar.gz' ...
OK [downloaded 2.3 Mb in 1.1 secs]
Retrieving 'https://packagemanager.rstudio.com/all/__linux__/focal/latest/src/contrib/here_1.0.1.tar.gz' ...
OK [downloaded 50.9 Kb in 1.4 secs]
Installing here [1.0.1] ...
OK [installed binary]
Installing reticulate [1.22] ...
OK [installed binary]
The following package(s) have been updated:
reticulate [installed version 1.22 != loaded version 1.18]
Consider restarting the R session and loading the newly-installed packages.
renv::use_python()
* Activated Python 3.8.8 [virtualenv; /nfs/team205/ed6/bin/Pan_fetal_immune/src/5_organ_signatures/MOFA_factor_analysis/renv/python/virtualenvs/renv-python-3.8.8]
* Project '/nfs/team205/ed6/bin/Pan_fetal_immune/src/5_organ_signatures/MOFA_factor_analysis' loaded. [renv 0.12.5]
* The project may be out of sync -- use `renv::status()` for more details.
py_pkgs <- c(
"scanpy",
"anndata",
"mofapy2"
)
reticulate::py_install(py_pkgs)
Error in (function (srcref) : unimplemented type (29) in 'eval'
# setwd('~/Pan_fetal_immune/src/5_organ_signatures/MOFA_factor_analysis/')
# renv::activate()
# renv::use_python()
#
# py_pkgs <- c(
# "scanpy",
# "anndata",
# "mofapy2"
# )
#
# reticulate::py_install(py_pkgs)
suppressPackageStartupMessages({
library(tidyverse)
library(MOFA2)
library(Matrix)
library(SingleCellExperiment)
library(scran)
library(glue)
library(scater)
library(patchwork)
library(batchelor)
library(rhdf5)
# library(ggraph)
}
)
Error: package or namespace load failed for ‘tidyverse’ in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):
namespace ‘ellipsis’ 0.3.1 is already loaded, but >= 0.3.2 is required
Define plotting utils
remove_x_axis <- function(){
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())
}
remove_y_axis <- function(){
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())
}
org_colors <- read_csv("~/Pan_fetal_immune/metadata/organ_colors.csv")
Duplicated column names deduplicated: 'organ' => 'organ_1' [2]
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
organ = [31mcol_character()[39m,
organ_1 = [31mcol_character()[39m,
color = [31mcol_character()[39m
)
org_colors <- setNames(org_colors$color, org_colors$organ)
figdir <- "~/mount/gdrive/Pan_fetal/Updates_and_presentations/figures/MOFA_analysis/"
if (!dir.exists(figdir)){ dir.create(figdir) }
split = "LYMPHOID"
indir <- glue("/nfs/team205/ed6/data/Fetal_immune/LMM_data/LMM_input_{split}_PBULK/")
matrix <- readMM(file = paste0(indir, "matrix.mtx.gz"))
coldata <- read.csv(file = paste0(indir, "metadata.csv.gz")) %>%
column_to_rownames("X")
rowdata <- read.csv(file = paste0(indir, "gene.csv.gz"))
## Make SingleCellExperiment obj
sce <- SingleCellExperiment(list(logcounts = t(matrix)), colData = coldata)
rownames(sce) <- make.unique(rowdata$GeneName)
## Plot number of cells per organ/celltype pair
n_cells_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_cells=sum(n_cells)) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=log10(n_cells))) +
geom_text(aes(label=n_cells), color="white") +
scale_fill_viridis_c() +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_samples_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_samples=n()) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=n_samples)) +
geom_text(aes(label=n_samples), color="white") +
scale_fill_viridis_c(option="cividis") +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_cells_heatmap / n_samples_heatmap
## Filter out samples with less than 20 cells
sce <- sce[,sce$n_cells > 20]
# Exclude celltypes present in just one organ
keep_ct <- data.frame(colData(sce)) %>%
dplyr::select(organ, anno_lvl_2_final_clean) %>%
distinct() %>%
group_by(anno_lvl_2_final_clean) %>%
summarise(n=n()) %>%
ungroup() %>%
filter(n > 1) %>%
pull(anno_lvl_2_final_clean)
sce <- sce[,sce$anno_lvl_2_final_clean %in% keep_ct]
# Filter out celltypes with less than 10 samples
keep_ct <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean) %>%
summarise(n_samples=n()) %>%
filter(n_samples >= 10) %>%
pull(anno_lvl_2_final_clean)
sce <- sce[,sce$anno_lvl_2_final_clean %in% keep_ct]
## Exclude low quality clusters
anno_groups <- jsonlite::fromJSON(txt = "~/Pan_fetal_immune/metadata/anno_groups.json")
sce <- sce[,!sce$anno_lvl_2_final_clean %in% anno_groups$OTHER]
## Exclude donor F19 (low Q)
sce <- sce[,!sce$donor %in% c('F19')]
## Exclude donors for which we have only one organ
data.frame(colData(sce))[c("Sample", "donor", "organ", 'age')] %>%
distinct(donor, organ, Sample, age) %>%
group_by(donor) %>%
mutate(n_organs = length(unique(organ))) %>%
ungroup() %>%
filter(n_organs >= 3) %>%
pull(donor) %>%
unique()
[1] "F45" "F23" "F30" "F38" "F41" "F29" "F35" "F51" "F21" "F32" "F50"
sce <- sce[,sce$donor %in% keep.donors]
## Plot number of cells per organ/celltype pair
n_cells_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_cells=sum(n_cells)) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=log10(n_cells))) +
geom_text(aes(label=n_cells), color="white") +
scale_fill_viridis_c() +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_samples_heatmap <- data.frame(colData(sce)) %>%
group_by(anno_lvl_2_final_clean, organ) %>%
summarise(n_samples=n()) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_tile(aes(fill=n_samples)) +
geom_text(aes(label=n_samples), color="white") +
scale_fill_viridis_c(option="cividis") +
theme_classic(base_size = 16) +
xlab("celltype") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
n_cells_heatmap / n_samples_heatmap
organ_order <- c("YS", "LI", "BM", "TH", "SP", "SK","KI","GU", "MLN")
pl <- data.frame(colData(sce)) %>%
dplyr::group_by(anno_lvl_2_final_clean, organ) %>%
dplyr::summarise(n_cells=sum(n_cells), n_samples=dplyr::n()) %>%
dplyr::group_by(anno_lvl_2_final_clean) %>%
dplyr::mutate(n_organs=dplyr::n(), org_frac=n_cells/sum(n_cells)) %>%
## is the ct overrepresented in one organ?
dplyr::mutate(delta_max_org_frac = max(org_frac)-org_frac) %>%
dplyr::mutate(mean_delta_max_org_frac = mean(delta_max_org_frac)) %>%
dplyr::ungroup() %>%
dplyr::arrange(n_organs, -mean_delta_max_org_frac) %>%
dplyr::mutate(anno_lvl_2_final_clean=factor(anno_lvl_2_final_clean, levels=rev(unique(anno_lvl_2_final_clean)))) %>%
dplyr::mutate(organ=factor(organ, levels=rev(organ_order))) %>%
ggplot(aes(anno_lvl_2_final_clean, organ)) +
geom_point(aes(color=log10(n_cells), size=n_samples)) +
geom_text(aes(label=n_cells), color="white") +
scale_size(range=c(7,18), breaks = c(0,1,10,30), name="# samples") +
scale_fill_viridis_c() +
scale_color_viridis_c(name="log10(# cells)") +
theme_classic(base_size = 20) +
xlab("celltype") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
`summarise()` has grouped output by 'anno_lvl_2_final_clean'. You can override using the `.groups` argument.
## Save order of CTs from widespread to restricted
anno_order <- levels(pl$data$anno_lvl_2_final_clean)
ggsave(paste0(figdir, "ct_organ_distribution.pdf"), pl, width = 15, height = 10)
## Feature selection w scran WITHIN CELLTYPE
anno_groups <- split(colnames(sce), sce$anno_lvl_2_final_clean)
all_hvgs <- c()
for (i in anno_groups){
dec <- modelGeneVar(sce[,i])
hvgs <- getTopHVGs(dec, n = 1000)
all_hvgs <- union(all_hvgs, hvgs)
}
sce <- sce[which(rowSums(logcounts(sce)) > 0),]
sce
class: SingleCellExperiment
dim: 27615 757
metadata(0):
assays(1): logcounts
rownames(27615): TSPAN6 TNMD ... AL356417.3 AP000646.1
rowData names(0):
colnames(757): F45_SK_CD45P_FCAImmP7579224-F45-SK-CD4+T-12-5GEX F45_SK_CD45P_FCAImmP7579224-F45-SK-CD8+T-12-5GEX ...
F50_SP_CD45P_FCAImmP7803020-F50-SP-IMMATURE_B-15-5GEX F30_TH_CD45N_FCAImmP7277565-F30-TH-ABT(ENTRY)-14-3GEX
colData names(7): Sample donor ... method n_cells
reducedDimNames(0):
altExpNames(0):
EDA with PCA
sce <- runPCA(sce, scale=TRUE, ncomponents=30,
exprs_values = "logcounts", subset_row=all_hvgs)
plotPCA(sce, colour_by="donor", ncomponents=6)
plotPCA(sce, colour_by="method", ncomponents=6)
plotPCA(sce, colour_by="organ", ncomponents=10)
Minimize obvious technical effects (3GEX/5GEX, donor) using linear regression (following procedure from OSCA)
## Regress technical effects
design <- model.matrix(~donor+method,data=colData(sce))
residuals <- regressBatches(sce, assay.type = "logcounts", design = design)
assay(sce, "corrected_logcounts") <- as.matrix(assay(residuals[,colnames(sce)], "corrected"))
## Regress organ (soup effect)
design <- model.matrix(~organ,data=colData(sce)) ## Include organ term to capture soup
residuals <- regressBatches(sce, assay.type = "corrected_logcounts", design = design)
assay(sce, "corrected_logcounts") <- as.matrix(assay(residuals[,colnames(sce)], "corrected"))
Check regression has an effect repeating PCA
sce <- runPCA(sce, scale=TRUE, ncomponents=30, exprs_values = "corrected_logcounts")
plotPCA(sce, colour_by="method", ncomponents=6)
plotPCA(sce, colour_by="donor", ncomponents=6)
plotPCA(sce, colour_by="organ", ncomponents=8)
## Feature selection w scran WITHIN CELLTYPE
anno_groups <- split(colnames(sce), sce$anno_lvl_2_final_clean)
all_hvgs <- c()
for (i in anno_groups){
dec <- modelGeneVar(sce[,i], assay.type = "corrected_logcounts")
hvgs <- getTopHVGs(dec, n = 1000)
all_hvgs <- union(all_hvgs, hvgs)
}
Make MOFA object (Use celltypes as grouping covariate)
mofa_obj <- readRDS(glue('{indir}LYMPHOID_mofa_obj_organCorrected_filteredDonors.RDS'))
Error in glue("{indir}LYMPHOID_mofa_obj_organCorrected_filteredDonors.RDS") :
could not find function "glue"
object <- mofa_obj
Prepare 4 training
data_opts <- get_default_data_options(object)
data_opts$use_float32 <- TRUE
data_opts$center_groups <- FALSE
object@data_options <- data_opts
model_opts <- get_default_model_options(object)
model_opts$num_factors <- 30
# model_opts$ard_factors <- FALSE
train_opts <- get_default_training_options(object)
train_opts$seed <- 2020
train_opts$convergence_mode <- "medium" # use "fast" for faster training
train_opts$stochastic <- FALSE
# mefisto_opts <- get_default_mefisto_options(object)
# mefisto_opts$warping <- FALSE
# mefisto_opts$sparseGP <- TRUE
object <- prepare_mofa(
object = object,
data_options = data_opts,
model_options = model_opts,
training_options = train_opts
)
Some group(s) have less than 10 samples, MOFA will have little power to learn meaningful factors for these group(s)...
# Multi-group mode requested.
This is an advanced option, if this is the first time that you are running MOFA, we suggest that you try do some exploration first without specifying groups. Two important remarks:
- The aim of the multi-group framework is to identify the sources of variability *within* the groups. If your aim is to find a factor that 'separates' the groups, you DO NOT want to use the multi-group framework. Please see the FAQ on the MOFA2 webpage.
- It is important to account for the group effect before selecting highly variable features (HVFs). We suggest that either you calculate HVFs per group and then take the union, or regress out the group effect before HVF selection
Checking data options...
Due to string size limitations in the HDF5 format, sample names will be trimmed to less than 50 charactersChecking training options...
Checking model options...
object
Untrained MOFA model with the following characteristics:
Number of views: 1
Views names: corrected_logcounts
Number of features (per view): 7481
Number of groups: 23
Groups names: ABT(ENTRY) B1 CD4+T CD8+T CD8AA CYCLING_MPP CYCLING_NK CYCLING_T HSC_MPP ILC3 IMMATURE_B LARGE_PRE_B LATE_PRO_B LMPP_ELP MATURE_B MEMP NK NK_T PRE_PRO_B PRO_B SMALL_PRE_B TH17 TREG
Number of samples (per group): 18 29 47 41 16 15 42 28 22 35 26 48 29 4 47 19 62 38 34 44 42 34 37
Wrapped in run_mofa.R
outfile <- glue('{indir}{split}_mofa_model_oneview_organCorrected_filteredDonors.hdf5')
mofa_trained <- run_mofa(object, outfile = outfile)
Connecting to the mofapy2 python package using reticulate (use_basilisk = FALSE)...
Please make sure to manually specify the right python binary when loading R with reticulate::use_python(..., force=TRUE) or the right conda environment with reticulate::use_condaenv(..., force=TRUE)
If you prefer to let us automatically install a conda environment with 'mofapy2' installed using the 'basilisk' package, please use the argument 'use_basilisk = TRUE'
mofapy2_0.6.4 is not detected in the specified python binary, see reticulate::py_config(). Setting use_basilisk = TRUE...Connecting to the mofapy2 package using basilisk.
Set 'use_basilisk' to FALSE if you prefer to manually set the python binary using 'reticulate'.
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): ...working... failed
CondaError: KeyboardInterrupt
Error: Error 2 occurred creating conda environment /home/jovyan/.cache/basilisk/1.2.1/MOFA2-1.3.4/mofa_env
### Tweaking the MOFA2 loading function because the quality control complains
load_model <- function(file, sort_factors = TRUE, on_disk = FALSE, load_data = TRUE,
remove_outliers = FALSE, remove_inactive_factors = TRUE, verbose = FALSE,
load_interpol_Z = FALSE) {
# Create new MOFAodel object
object <- new("MOFA")
object@status <- "trained"
# Set on_disk option
if (on_disk) {
object@on_disk <- TRUE
} else {
object@on_disk <- FALSE
}
# Get groups and data set names from the hdf5 file object
h5ls.out <- h5ls(file, datasetinfo = FALSE)
########################
## Load training data ##
########################
# Load names
if ("views" %in% h5ls.out$name) {
view_names <- as.character( h5read(file, "views")[[1]] )
group_names <- as.character( h5read(file, "groups")[[1]] )
feature_names <- h5read(file, "features")[view_names]
sample_names <- h5read(file, "samples")[group_names]
} else { # for old models
feature_names <- h5read(file, "features")
sample_names <- h5read(file, "samples")
view_names <- names(feature_names)
group_names <- names(sample_names)
h5ls.out <- h5ls.out[grep("variance_explained", h5ls.out$name, invert = TRUE),]
}
if("covariates" %in% h5ls.out$name){
covariate_names <- as.character( h5read(file, "covariates")[[1]])
} else {
covariate_names <- NULL
}
# Load training data (as nested list of matrices)
data <- list(); intercepts <- list()
if (load_data && "data"%in%h5ls.out$name) {
object@data_options[["loaded"]] <- TRUE
if (verbose) message("Loading data...")
for (m in view_names) {
data[[m]] <- list()
intercepts[[m]] <- list()
for (g in group_names) {
if (on_disk) {
# as DelayedArrays
data[[m]][[g]] <- DelayedArray::DelayedArray( HDF5ArraySeed(file, name = sprintf("data/%s/%s", m, g) ) )
} else {
# as matrices
data[[m]][[g]] <- h5read(file, sprintf("data/%s/%s", m, g) )
tryCatch(intercepts[[m]][[g]] <- as.numeric( h5read(file, sprintf("intercepts/%s/%s", m, g) ) ), error = function(e) { NULL })
}
# Replace NaN by NA
data[[m]][[g]][is.nan(data[[m]][[g]])] <- NA # this realised into memory, TO FIX
}
}
# Create empty training data (as nested list of empty matrices, with the correct dimensions)
} else {
object@data_options[["loaded"]] <- FALSE
for (m in view_names) {
data[[m]] <- list()
for (g in group_names) {
data[[m]][[g]] <- .create_matrix_placeholder(rownames = feature_names[[m]], colnames = sample_names[[g]])
}
}
}
object@data <- data
object@intercepts <- intercepts
# Load metadata if any
if ("samples_metadata" %in% h5ls.out$name) {
object@samples_metadata <- bind_rows(lapply(group_names, function(g) as.data.frame(h5read(file, sprintf("samples_metadata/%s", g)))))
}
if ("features_metadata" %in% h5ls.out$name) {
object@features_metadata <- bind_rows(lapply(view_names, function(m) as.data.frame(h5read(file, sprintf("features_metadata/%s", m)))))
}
# ############################
# ## Load sample covariates ##
# ############################
#
# if (any(grepl("cov_samples", h5ls.out$group))){
# covariates <- list()
# for (g in group_names) {
# if (on_disk) {
# # as DelayedArrays
# covariates[[g]] <- DelayedArray::DelayedArray( HDF5ArraySeed(file, name = sprintf("cov_samples/%s", g) ) )
# } else {
# # as matrices
# covariates[[g]] <- h5read(file, sprintf("cov_samples/%s", g) )
# }
# }
# } else covariates <- NULL
# object@covariates <- covariates
# if (any(grepl("cov_samples_transformed", h5ls.out$group))){
# covariates_warped <- list()
# for (g in group_names) {
# if (on_disk) {
# # as DelayedArrays
# covariates_warped[[g]] <- DelayedArray::DelayedArray( HDF5ArraySeed(file, name = sprintf("cov_samples_transformed/%s", g) ) )
# } else {
# # as matrices
# covariates_warped[[g]] <- h5read(file, sprintf("cov_samples_transformed/%s", g) )
# }
# }
# } else covariates_warped <- NULL
# object@covariates_warped <- covariates_warped
# #######################
# ## Load interpolated factor values ##
# #######################
#
# interpolated_Z <- list()
# if (isTRUE(load_interpol_Z)) {
#
# if (isTRUE(verbose)) message("Loading interpolated factor values...")
#
# for (g in group_names) {
# interpolated_Z[[g]] <- list()
# if (on_disk) {
# # as DelayedArrays
# # interpolated_Z[[g]] <- DelayedArray::DelayedArray( HDF5ArraySeed(file, name = sprintf("Z_predictions/%s", g) ) )
# } else {
# # as matrices
# tryCatch( {
# interpolated_Z[[g]][["mean"]] <- h5read(file, sprintf("Z_predictions/%s/mean", g) )
# }, error = function(x) { print("Predicitions of Z not found, not loading it...") })
# tryCatch( {
# interpolated_Z[[g]][["variance"]] <- h5read(file, sprintf("Z_predictions/%s/variance", g) )
# }, error = function(x) { print("Variance of predictions of Z not found, not loading it...") })
# tryCatch( {
# interpolated_Z[[g]][["new_values"]] <- h5read(file, "Z_predictions/new_values")
# }, error = function(x) { print("New values of Z not found, not loading it...") })
# }
# }
# }
# object@interpolated_Z <- interpolated_Z
#######################
## Load expectations ##
#######################
expectations <- list()
node_names <- h5ls.out[h5ls.out$group=="/expectations","name"]
if (verbose) message(paste0("Loading expectations for ", length(node_names), " nodes..."))
if ("AlphaW" %in% node_names)
expectations[["AlphaW"]] <- h5read(file, "expectations/AlphaW")[view_names]
if ("AlphaZ" %in% node_names)
expectations[["AlphaZ"]] <- h5read(file, "expectations/AlphaZ")[group_names]
if ("Sigma" %in% node_names)
expectations[["Sigma"]] <- h5read(file, "expectations/Sigma")
if ("Z" %in% node_names)
expectations[["Z"]] <- h5read(file, "expectations/Z")[group_names]
if ("W" %in% node_names)
expectations[["W"]] <- h5read(file, "expectations/W")[view_names]
if ("ThetaW" %in% node_names)
expectations[["ThetaW"]] <- h5read(file, "expectations/ThetaW")[view_names]
if ("ThetaZ" %in% node_names)
expectations[["ThetaZ"]] <- h5read(file, "expectations/ThetaZ")[group_names]
# if ("Tau" %in% node_names)
# expectations[["Tau"]] <- h5read(file, "expectations/Tau")
object@expectations <- expectations
########################
## Load model options ##
########################
if (verbose) message("Loading model options...")
tryCatch( {
object@model_options <- as.list(h5read(file, 'model_options', read.attributes = TRUE))
}, error = function(x) { print("Model options not found, not loading it...") })
# Convert True/False strings to logical values
for (i in names(object@model_options)) {
if (object@model_options[i] == "False" || object@model_options[i] == "True") {
object@model_options[i] <- as.logical(object@model_options[i])
} else {
object@model_options[i] <- object@model_options[i]
}
}
##########################################
## Load training options and statistics ##
##########################################
if (verbose) message("Loading training options and statistics...")
# Load training options
if (length(object@training_options) == 0) {
tryCatch( {
object@training_options <- as.list(h5read(file, 'training_opts', read.attributes = TRUE))
}, error = function(x) { print("Training opts not found, not loading it...") })
}
# Load training statistics
tryCatch( {
object@training_stats <- h5read(file, 'training_stats', read.attributes = TRUE)
object@training_stats <- h5read(file, 'training_stats', read.attributes = TRUE)
}, error = function(x) { print("Training stats not found, not loading it...") })
#############################
## Load covariates options ##
#############################
#
# if (any(grepl("cov_samples", h5ls.out$group))) {
# if (isTRUE(verbose)) message("Loading covariates options...")
# tryCatch( {
# object@mefisto_options <- as.list(h5read(file, 'smooth_opts', read.attributes = TRUE))
# }, error = function(x) { print("Covariates options not found, not loading it...") })
#
# # Convert True/False strings to logical values
# for (i in names(object@mefisto_options)) {
# if (object@mefisto_options[i] == "False" | object@mefisto_options[i] == "True") {
# object@mefisto_options[i] <- as.logical(object@mefisto_options[i])
# } else {
# object@mefisto_options[i] <- object@mefisto_options[i]
# }
# }
#
# }
#
#######################################
## Load variance explained estimates ##
#######################################
if ("variance_explained" %in% h5ls.out$name) {
r2_list <- list(
r2_total = h5read(file, "variance_explained/r2_total")[group_names],
r2_per_factor = h5read(file, "variance_explained/r2_per_factor")[group_names]
)
object@cache[["variance_explained"]] <- r2_list
}
# Hack to fix the problems where variance explained values range from 0 to 1 (%)
if (max(sapply(object@cache$variance_explained$r2_total,max,na.rm=TRUE),na.rm=TRUE)<1) {
for (m in 1:length(view_names)) {
for (g in 1:length(group_names)) {
object@cache$variance_explained$r2_total[[g]][[m]] <- 100 * object@cache$variance_explained$r2_total[[g]][[m]]
object@cache$variance_explained$r2_per_factor[[g]][,m] <- 100 * object@cache$variance_explained$r2_per_factor[[g]][,m]
}
}
}
##############################
## Specify dimensionalities ##
##############################
# Specify dimensionality of the data
object@dimensions[["M"]] <- length(data) # number of views
object@dimensions[["G"]] <- length(data[[1]]) # number of groups
object@dimensions[["N"]] <- sapply(data[[1]], ncol) # number of samples (per group)
object@dimensions[["D"]] <- sapply(data, function(e) nrow(e[[1]])) # number of features (per view)
# object@dimensions[["C"]] <- nrow(covariates[[1]]) # number of covariates
object@dimensions[["K"]] <- ncol(object@expectations$Z[[1]]) # number of factors
# Assign sample and feature names (slow for large matrices)
if (verbose) message("Assigning names to the different dimensions...")
# Create default features names if they are null
if (is.null(feature_names)) {
print("Features names not found, generating default: feature1_view1, ..., featureD_viewM")
feature_names <- lapply(seq_len(object@dimensions[["M"]]),
function(m) sprintf("feature%d_view_&d", as.character(seq_len(object@dimensions[["D"]][m])), m))
} else {
# Check duplicated features names
all_names <- unname(unlist(feature_names))
duplicated_names <- unique(all_names[duplicated(all_names)])
if (length(duplicated_names)>0)
warning("There are duplicated features names across different views. We will add the suffix *_view* only for those features
Example: if you have both TP53 in mRNA and mutation data it will be renamed to TP53_mRNA, TP53_mutation")
for (m in names(feature_names)) {
tmp <- which(feature_names[[m]] %in% duplicated_names)
if (length(tmp)>0) feature_names[[m]][tmp] <- paste(feature_names[[m]][tmp], m, sep="_")
}
}
features_names(object) <- feature_names
# Create default samples names if they are null
if (is.null(sample_names)) {
print("Samples names not found, generating default: sample1, ..., sampleN")
sample_names <- lapply(object@dimensions[["N"]], function(n) paste0("sample", as.character(seq_len(n))))
}
samples_names(object) <- sample_names
# Add covariates names
# if(!is.null(object@covariates)){
# # Create default covariates names if they are null
# if (is.null(covariate_names)) {
# print("Covariate names not found, generating default: covariate1, ..., covariateC")
# covariate_names <- paste0("sample", as.character(seq_len(object@dimensions[["C"]])))
# }
# covariates_names(object) <- covariate_names
# }
# Set views names
if (is.null(names(object@data))) {
print("Views names not found, generating default: view1, ..., viewM")
view_names <- paste0("view", as.character(seq_len(object@dimensions[["M"]])))
}
views_names(object) <- view_names
# Set groups names
if (is.null(names(object@data[[1]]))) {
print("Groups names not found, generating default: group1, ..., groupG")
group_names <- paste0("group", as.character(seq_len(object@dimensions[["G"]])))
}
groups_names(object) <- group_names
# Set factors names
factors_names(object) <- paste0("Factor", as.character(seq_len(object@dimensions[["K"]])))
###################
## Parse factors ##
###################
# Calculate variance explained estimates per factor
if (is.null(object@cache[["variance_explained"]])) {
object@cache[["variance_explained"]] <- calculate_variance_explained(object)
}
# Remove inactive factors
if (remove_inactive_factors) {
r2 <- rowSums(do.call('cbind', lapply(object@cache[["variance_explained"]]$r2_per_factor, rowSums, na.rm=TRUE)))
var.threshold <- 0.0001
if (all(r2 < var.threshold)) {
warning(sprintf("All %s factors were found to explain little or no variance so remove_inactive_factors option has been disabled.", length(r2)))
} else if (any(r2 < var.threshold)) {
object <- subset_factors(object, which(r2>=var.threshold))
message(sprintf("%s factors were found to explain no variance and they were removed for downstream analysis. You can disable this option by setting load_model(..., remove_inactive_factors = FALSE)", sum(r2 < var.threshold)))
}
}
# [Done in mofapy2] Sort factors by total variance explained
if (sort_factors && object@dimensions$K>1) {
# Sanity checks
if (verbose) message("Re-ordering factors by their variance explained...")
# Calculate variance explained per factor across all views
r2 <- rowSums(sapply(object@cache[["variance_explained"]]$r2_per_factor, function(e) rowSums(e, na.rm = TRUE)))
order_factors <- c(names(r2)[order(r2, decreasing = TRUE)])
# re-order factors
object <- subset_factors(object, order_factors)
}
# Mask outliers
if (remove_outliers) {
if (verbose) message("Removing outliers...")
object <- .detect_outliers(object)
}
# Mask intercepts for non-Gaussian data
if (any(object@model_options$likelihoods!="gaussian")) {
for (m in names(which(object@model_options$likelihoods!="gaussian"))) {
for (g in names(object@intercepts[[m]])) {
object@intercepts[[m]][[g]] <- NA
}
}
}
# ######################
# ## Quality controls ##
# ######################
#
# if (verbose) message("Doing quality control...")
# object <- .quality_control(object, verbose = verbose)
#
return(object)
}
mofa_trained <- load_model(outfile)
samples_names(mofa_trained) <- samples_names(object)
samples_metadata(mofa_trained)
rownames(samples_metadata(mofa_trained)) <- samples_metadata(mofa_trained)[["sample"]]
get_variance_explained(mofa_trained, as.data.frame = TRUE, )[[1]] %>%
dplyr::mutate(group=factor(group, levels=rev(anno_order))) %>%
ggplot(aes(factor,group, fill=value)) +
geom_tile() +
scale_fill_viridis_c() +
theme(axis.text.x = element_text(angle=45, hjust=1))
get_variance_explained(mofa_trained, as.data.frame = TRUE, )[[2]] %>%
dplyr::mutate(group=factor(group, levels=anno_order)) %>%
ggplot(aes(group, value)) +
geom_col() +
coord_flip() +
ylab("Var. (%)") +
theme_classic(base_size=14)
Do factors relate to the number of cells in pseudobulk?
n_cells <- mofa_trained@samples_metadata[,'n_cells', drop=FALSE]
Z <- get_factors(mofa_trained)
Z <- purrr::reduce(Z, rbind)
barplot(cor(n_cells, Z))
Distinguish technical factors by weight sparsity
get_weights(mofa_trained, abs=TRUE, scale = FALSE, as.data.frame = TRUE) %>%
dplyr::group_by(factor) %>%
dplyr::mutate(value=(value - min(value))/(max(value)- min(value)), rank=rank(value)) %>%
dplyr::summarise(frac_zeros=sum(value < 0.05)/dplyr::n()) %>%
ggplot(aes(factor, frac_zeros)) +
geom_col() +
coord_flip() +
geom_hline(yintercept = 0.5, color="red")
exclude_factors <- get_weights(mofa_trained, abs=TRUE, scale = FALSE, as.data.frame = TRUE) %>%
dplyr::group_by(factor) %>%
dplyr::mutate(value=(value - min(value))/(max(value)- min(value)), rank=rank(value)) %>%
dplyr::summarise(frac_zeros=sum(value < 0.05)/dplyr::n()) %>%
dplyr::filter(frac_zeros < 0.55) %>%
dplyr::pull(factor)
high_r2_groups_df <- get_variance_explained(mofa_trained, as.data.frame = TRUE)[[1]] %>%
dplyr::filter(!factor %in% exclude_factors) %>%
dplyr::group_by(group) %>%
dplyr::mutate(tot_var=sum(value)) %>%
dplyr::ungroup() %>%
dplyr::arrange(tot_var) %>%
dplyr::mutate(group=factor(group, levels = unique(group))) %>%
dplyr::mutate(value=ifelse(value < 5, NA, value)) %>%
dplyr::filter(!is.na(value))
high_r2_groups_df %>%
ggplot(aes(factor,group, fill=value)) +
geom_tile() +
scale_fill_gradientn(colors=c("gray97","darkblue"), guide="colorbar") +
theme_classic() +
theme(axis.text.x = element_text(angle=45, hjust=1))
Calculating Adjusted Mutual Information between organ identity and clustering of pseudobulks based on factor values
calc_organ_AMI <- function(f, g){
dmat <- dist(get_factors(mofa_trained, factors = f, groups = g)[[1]])
hcl <- hclust(dmat)
n_organs <- length(unique(samples_metadata(mofa_trained)[rownames(as.matrix(dmat)),'organ']))
hcl_df <- data.frame(clust=cutree(hcl, k=n_organs)) %>%
tibble::rownames_to_column("sample") %>%
dplyr::left_join(samples_metadata(mofa_trained))
organ_AMI <- aricode::AMI(hcl_df$clust, as.numeric(hcl_df$organ))
return(organ_AMI)
}
samples_metadata(mofa_trained)$organ <- as.factor(samples_metadata(mofa_trained)$organ)
## Calc adjusted mutual info for each factor
AMIs <- sapply(1:nrow(high_r2_groups_df), function(i) calc_organ_AMI(high_r2_groups_df$factor[i], high_r2_groups_df$group[i]))
## Calc adjusted mutual info for all factors that explan > 2% variance
groups <- as.character(unique(high_r2_groups_df$group))
ct_AMIs <- sapply(groups, function(g) calc_organ_AMI(high_r2_groups_df$factor[high_r2_groups_df$group == g], g))
AMI_pl <- data.frame(ct_AMIs) %>%
dplyr::arrange(ct_AMIs) %>%
tibble::rownames_to_column("celltype") %>%
# dplyr::mutate(celltype=factor(rowname, levels=unique(rowname))) %>%
dplyr::mutate(celltype=factor(celltype, levels=rev(anno_order))) %>%
ggplot(aes(ct_AMIs, celltype)) +
geom_col() +
xlab("Total Organ AMI") +
theme_classic(base_size = 16)
AMI_f_pl <- high_r2_groups_df %>%
dplyr::mutate(org_AMI=AMIs) %>%
dplyr::mutate(group=factor(group, levels=levels(AMI_pl$data$celltype))) %>%
ggplot(aes(factor, group, fill=org_AMI)) +
geom_tile(color='black') +
# scale_fill_gradientn(colors=c("gray97","red"), name="Organ Adj. Mutual Info") +
scale_fill_viridis_c(option='magma') +
theme_classic(base_size = 16) +
theme(axis.text.x = element_text(angle=45, hjust=1))
AMI_f_pl + (AMI_pl + remove_y_axis()) +
plot_layout(guides="collect", widths = c(8,3))
## Save info on MI
high_r2_groups_df <- high_r2_groups_df %>%
dplyr::mutate(org_AMI=AMIs)
## Fix long names for plotting
all_groups <- names(get_data(mofa_trained)[[1]])
group_labeller <- all_groups %>%
str_replace_all("_", " ") %>%
{ifelse(nchar(.) > 20, str_replace(., " ", "\n"), .)} %>%
setNames(all_groups)
AMI_pl_df <- high_r2_groups_df %>%
dplyr::group_by(group) %>%
dplyr::mutate(mean_AMI=max(org_AMI)) %>%
dplyr::ungroup() %>%
dplyr::arrange(mean_AMI) %>%
dplyr::mutate(group=group_labeller[as.character(group)]) %>%
dplyr::mutate(group=factor(group, levels=unique(group)))
AMI_pl_df %>%
ggplot(aes(org_AMI, group)) +
geom_point(aes(fill=value), size=3, shape=21) +
ggrepel::geom_text_repel(aes(label=str_remove(factor, "Factor")), color="black", force = 0.1, direction = 'x',
nudge_y = 0.4,
hjust = 0) +
xlab("Adj. Mutual Information - Organ ") +
scale_fill_gradientn(colours = c("white", "red"), name="% var. explained") +
theme_bw(base_size = 15)
for (fact in as.character(unique(AMI_pl_df$factor))){
p <- AMI_pl_df %>%
ggplot(aes(org_AMI, group)) +
geom_point(fill="grey", size=2, shape=21, color="grey") +
geom_point(data = . %>% dplyr::filter(factor==fact),
aes(fill=value), size=3, shape=21) +
xlab("Adj. Mutual Information - Organ ") +
scale_fill_gradientn(colours = c("white", "red"), name="% var. explained") +
theme_bw(base_size = 15) +
ggtitle(fact)
print(p)
}
get_top_weight_genes <- function(mofa_trained, f, n_top=20, which="top"){
w_df <- get_weights(mofa_trained, factors = f, as.data.frame = TRUE) %>%
dplyr::arrange(value)
top_genes <- w_df %>%
dplyr::top_n(n_top, value) %>%
dplyr::pull(feature) %>%
as.character()
bot_genes <- w_df %>%
dplyr::top_n(n_top, -value) %>%
dplyr::pull(feature) %>%
as.character()
if (which=="top") {
genes <- top_genes
} else if (which=="bottom"){
genes <- bot_genes
} else if (which=="both"){
genes <- c(top_genes, bot_genes)
}
return(genes)
}
plot_data_top_weights <- function(mofa_trained, ct, f, n_top=20, which="top"){
genes <- get_top_weight_genes(mofa_trained, f, which=which, n_top=n_top)
data <- get_data(mofa_trained, groups=ct)[[1]][[1]][genes,]
pl_df <- reshape2::melt(data, varnames=c("gene", "sample")) %>%
dplyr::left_join(samples_metadata(mofa_trained)) %>%
dplyr::arrange(age) %>%
dplyr::mutate(sample=factor(sample, levels=unique(sample))) %>%
dplyr::group_by(gene) %>%
dplyr::mutate(value=scale(value))
pl_df %>%
ggplot(aes(sample, gene, fill=value)) +
geom_tile() +
facet_grid(.~organ, space="free", scales="free") +
scale_fill_gradient2(high="red", low="blue", name="Scaled\nexpression") +
xlab("----age--->") + ylab(glue("{which} weight genes")) +
theme_bw(base_size=16) +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank()) +
ggtitle(glue('{ct} - {f}'))
}
for (g in all_groups){
fs <- get_top_factor_per_celltype(mofa_trained, g, min_R2=5)
fs <- fs[!fs %in% exclude_factors]
if (length(fs) > 0){
top_plots <- lapply(fs, function(x) (plot_data_top_weights(mofa_trained, g, x, which="top") + remove_x_axis()) /
plot_data_top_weights(mofa_trained, g, x, which="bottom") + ggtitle("")
)
full_pl <-wrap_plots(top_plots, ncol=1)
ggsave(glue("{figdir}/top_factors_expr_{g}.pdf"),plot=full_pl, width=12, height = 10*length(top_plots))
}
}
minmax_normalize <- function(x, na.rm = TRUE) {
return((x- min(x)) /(max(x)-min(x)))
}
plot_data_top_weights_clustered <- function(mofa_trained, cts, f, n_top=20, which="top", scale_data=TRUE){
genes <- get_top_weight_genes(mofa_trained, f, which=which, n_top=n_top)
genes_anno <- data.frame(gene=genes)
if (which!="both"){ genes_anno[["weight"]] <- rep(which, n_top) } else { genes_anno[["weight"]] <- c(rep("top", n_top), rep("bottom", n_top)) }
data_ls <- get_data(mofa_trained, groups=cts)[[1]]
data <- Reduce(cbind, data_ls)[genes,]
ct_pl_ls <- lapply(cts, function(ct){
ct_samples <- colnames(mofa_trained@data[[1]][[ct]])
ct_data <- data[,ct_samples]
if (scale_data){
ct_data <- t(apply(ct_data, 1, minmax_normalize))
}
cl_heatmap <- pheatmap::pheatmap(ct_data, show_colnames= FALSE, cluster_rows = FALSE, )
col_order <- cl_heatmap$tree_col$labels[cl_heatmap$tree_col$order]
pl_df <- reshape2::melt(ct_data, varnames=c("gene", "sample")) %>%
dplyr::left_join(samples_metadata(mofa_trained)) %>%
dplyr::left_join(genes_anno) %>%
dplyr::mutate(sample=factor(sample, levels=col_order),
weight=factor(weight, levels=c("top", "bottom")))
pl_bar <- pl_df %>%
ggplot(aes(sample, "organ", fill=organ)) +
geom_tile() +
scale_fill_manual(values=org_colors) +
theme_void() +
theme(legend.position = "none")
pl_hm <- pl_df %>%
ggplot(aes(sample, gene, fill=value)) +
geom_tile() +
scale_fill_viridis_c(option="magma", name="Scaled\nexpression") +
xlab(group_labeller[ct]) +
facet_grid(weight~., scales="free", space="free") +
theme_bw(base_size=12) +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
(pl_bar / pl_hm) + plot_layout(heights = c(1,10))
})
if (length(ct_pl_ls) > 1){
## Remove gene names to all except 1st plot
ct_pl_ls[2:length(ct_pl_ls)] <- lapply(ct_pl_ls[2:length(ct_pl_ls)], function(p) p + remove_y_axis())
## Remove strip names to all except last plot
ct_pl_ls[1:(length(ct_pl_ls)-1)] <- lapply(ct_pl_ls[1:(length(ct_pl_ls)-1)], function(p) p + theme(strip.background = element_blank(),strip.text.y = element_blank()))
wrap_plots(ct_pl_ls) +
plot_layout(guides="collect", nrow = 1)
} else {
ct_pl_ls[[1]]
}
}
plot_factor_organ_boxplots <- function(f, cts){
pl_ls <- lapply(cts, function(g) plot_factor(mofa_trained,groups = c(g), color_by="organ", dot_size = 3, factors = f,
add_boxplot = TRUE, boxplot_alpha = 0.1,
group_by = 'group', dodge = TRUE) +
scale_fill_manual(values=org_colors) +
scale_color_manual(values=org_colors) +
ggtitle(group_labeller[g])
)
wrap_plots(pl_ls) +
plot_layout(guides="collect", nrow=1) +
plot_annotation(title=f)
}
high_r2_groups_df_filt <- high_r2_groups_df %>%
dplyr::filter(org_AMI > 0.3) %>%
dplyr::arrange(- org_AMI)
for (fact in as.character(unique(high_r2_groups_df_filt$factor))){
fact_cts = as.character(high_r2_groups_df_filt$group[high_r2_groups_df_filt$factor==fact])
p_top <- plot_factor_organ_boxplots(cts=fact_cts, f=fact)
p_bottom <- plot_data_top_weights_clustered(mofa_trained, cts=fact_cts, f=fact, which = "both", scale_data = TRUE)
f_pl <- (p_top / p_bottom) +
plot_layout(heights = c(1,2.5))
ggsave(glue("{figdir}/{fact}_top_organ_AMI_plot.pdf"), plot=f_pl, width=5 + (3*length(fact_cts)), height = 9)
}
get_factors(mofa_trained, as.data.frame = TRUE) %>%
left_join(samples_metadata(mofa_trained)) %>%
mutate(sort=ifelse(str_detect(Sample, "CD45P"), "CD45+", ifelse(str_detect(Sample,"CD45N"), "CD45-", ifelse(str_detect(Sample, "TOT"), "TOT", "other")))) %>%
filter(organ=="TH" & factor=="Factor2" & group=="CD8AA") %>%
ggplot(aes(value,Sample, color=sort)) +
geom_point() +
ggtitle("TREG") +
xlab("Factor2")
p1 <- get_factors(mofa_trained, as.data.frame = TRUE) %>%
left_join(samples_metadata(mofa_trained)) %>%
filter(organ=="TH" & factor=="Factor2" & group=="TREG") %>%
ggplot(aes(value,Sample, color=age)) +
geom_point() +
scale_color_viridis_c() +
ggtitle("TREG") +
xlab("Factor2")
p2 <- get_factors(mofa_trained, as.data.frame = TRUE) %>%
left_join(samples_metadata(mofa_trained)) %>%
filter(organ=="TH" & factor=="Factor2" & group=="TH17") %>%
ggplot(aes(value,Sample, color=age)) +
geom_point() +
scale_color_viridis_c() +
ggtitle("TH17") +
xlab("Factor2")
p3 <- get_factors(mofa_trained, as.data.frame = TRUE) %>%
left_join(samples_metadata(mofa_trained)) %>%
filter(organ=="TH" & factor=="Factor2" & group=="CD8AA") %>%
ggplot(aes(value,Sample, color=age)) +
geom_point() +
scale_color_viridis_c() +
ggtitle("CD8AA") +
xlab("Factor2")
(p1 / p2 /p3) + plot_layout(guides='collect')
plot_variance_explained(mofa_trained, x='factor', y='group', split_by = 'view', plot_total = TRUE, max_r2 = 50)[[1]] +
theme(axis.text.x = element_text(angle=45, hjust=1))
get_variance_explained(mofa_trained, as.data.frame = TRUE)[[2]] %>%
ggplot(aes(group, value)) +
geom_col() +
coord_flip() +
ylab("Var. (%)") +
theme_classic(base_size=14)
Plot by celltype
get_variance_explained(mofa_trained, as.data.frame = TRUE)[[1]] %>%
ggplot(aes(factor, value)) + geom_col() +
coord_flip() +
facet_wrap(group~., ncol = 6, scales = "free_x")
plot_factor_cor(mofa_trained, method = "spearman")
## Correlation with principal components
pcs <- reducedDim(sce)
fctrs <- get_factors(mofa_trained) %>%
purrr::reduce(rbind)
corrplot::corrplot(cor(pcs, fctrs[rownames(pcs),]))
plot_factor_ordered <- function(mofa_trained, f){
factor_df <- get_factors(mofa_trained, factors = f, as.data.frame = TRUE) %>%
mutate(organ = sapply(str_split(sample, "_"), function(x) x[2])) %>%
group_by(group) %>%
mutate(gr_mean = median(value)) %>%
ungroup() %>%
arrange(gr_mean) %>%
mutate(group=factor(group, levels=unique(group)))
r2_df <- get_variance_explained(mofa_trained, factors = f, as.data.frame = TRUE)[[1]] %>%
filter(factor==paste0('Factor',f)) %>%
mutate(group=factor(group, levels = levels(factor_df$group)))
pl1 <- factor_df %>%
ggplot(aes(group, value)) +
geom_boxplot() +
geom_jitter(aes(color= organ), size=0.7) +
geom_hline(yintercept = 0, linetype=2) +
coord_flip() +
ylab(paste0("Factor ", f)) +
theme_bw(base_size = 14)
pl2 <- r2_df %>%
ggplot(aes(group, value)) +
geom_col() +
coord_flip() +
ylab("% variance explained") +
theme_bw(base_size = 14) +
remove_y_axis()
pl1 + pl2 + plot_layout(widths=c(2,1), guides="collect")
}
get_top_celltype_per_factor <- function(mofa_trained, f){
r2_df <- get_variance_explained(mofa_trained, factors = f, as.data.frame = TRUE)[[1]] %>%
filter(factor==paste0('Factor',f))
# mutate(group=factor(group, levels = ))
top_quant_r2 <- quantile(r2_df$value, probs = seq(0, 1, by = 0.2))["80%"]
top_groups <- r2_df$group[r2_df$value >= top_quant_r2]
return(top_groups)
}
save_factor_id <- function(mofa_trained, f, figdir){
## Order celltypes by factor values
p1 <- plot_factor_ordered(mofa_trained, f)
## Plot factor values across organs for celltypes with high variance explained
p2 <- plot_factor(mofa_trained, factors = f, groups = get_top_celltype_per_factor(mofa_trained, f), group_by = "group",
color_by = "organ",
dot_size = 2, dodge = TRUE
)
## Plot factor weights on genes
# plot_data_heatmap(mofa_trained, factor = f, nfeatures = 50, text_size = 3, show_colnames=FALSE,
# annotation_samples = c("organ", "time", "method", "donor"))
p3 <- plot_weights(mofa_trained, factors = f, nfeatures = 30, text_size = 3) +
scale_y_discrete(expand=c(0.1, 0.1))
full_pl <- (p1 | (p2 / p3)) +
plot_layout(guides="collect")
ggsave(glue("{figdir}/MOFA_{split}_factorID_factor{f}.pdf"), plot=full_pl, width = 15, height = 10)
}
for (f in 1:mofa_trained@dimensions$K){
print(paste0("Saving ID for Factor ", f, "..."))
save_factor_id(mofa_trained, f=f, figdir = figdir)
}
# save_factor_id(mofa_trained, f=1, figdir = figdir)
# plot_weights(mofa_trained, factors = f, nfeatures = 30, text_size = 3) +
# scale_y_discrete(expand=c(0.1, 0.1))
## Get factors that explain most variance in each celltype
get_top_factor_per_celltype <- function(mofa_trained, gr, min_R2=2){
get_variance_explained(mofa_trained, as.data.frame = TRUE)[[1]] %>%
filter(group==gr) %>%
filter(value >= min_R2) %>%
pull(factor) %>%
as.character()
}
## Make KNN graph based on similarity of top factors for each celltype
get_ct_KNN_graph <- function(mofa_trained, gr, min_R2=5, k=5){
## Get factors that explain most variance per celltype
fs <- get_top_factor_per_celltype(mofa_trained, gr, min_R2 = min_R2)
## Exclude factor1 (proliferation)
fs <- fs[!fs %in% c("Factor1", "Factor2")]
## Make KNN graph from top factors
Z <- get_factors(mofa_trained, groups=gr, factors = fs)[[1]]
knn_ct <- buildKNNGraph(t(Z), k=k)
## Add attributes
metadata_ct <- samples_metadata(mofa_trained)[rownames(Z),]
# covariates
V(knn_ct)$organ <- metadata_ct$organ
V(knn_ct)$age <- metadata_ct$age
V(knn_ct)$n_cells <- metadata_ct$n_cells
V(knn_ct)$method <- metadata_ct$method
V(knn_ct)$donor <- metadata_ct$donor
# top factors
for (c in colnames(Z)){
vertex_attr(knn_ct)[[c]] <- Z[,c]
}
return(knn_ct)
}
## Plot KNN graph
plot_ct_KNN_graph <- function(knn, color_by="organ"){
## Define color
if (!color_by %in% names(vertex_attr(knn))){
stop("specified color_by variable is not in vertex_attr(knn)")
}
if (color_by=="organ"){
scale_color_knngraph <- scale_color_manual(values=org_colors)
} else if (is.numeric(vertex_attr(knn, color_by))){
scale_color_knngraph <- scale_color_viridis_c(option="magma")
} else {
scale_color_knngraph <- scale_color_discrete()
}
vertex_attr(knn, "color_by") <- vertex_attr(knn, color_by)
ggraph(knn) +
geom_edge_link0() +
geom_node_point(aes(color=color_by, size=n_cells)) +
theme(panel.background = element_blank()) +
scale_color_knngraph +
scale_size(range=c(2,7))
}
get_top_factor_per_celltype(mofa_trained, "NK")
all_groups <- names(get_data(mofa_trained)[[1]])
knn_graph_pl <- lapply(all_groups, function(g){
knn <- get_ct_KNN_graph(mofa_trained, g, k=5, min_R2 = 2)
plot_ct_KNN_graph(knn, color_by = 'organ') + ggtitle(g)
})
knn_graph_pl <- setNames(knn_graph_pl, all_groups)
knn <- get_ct_KNN_graph(mofa_trained, 'B1', k=5, min_R2 = 1)
plot_ct_KNN_graph(knn, color_by = 'Factor5')
plot_factor(mofa_trained,groups = 'MATURE_B', factors = c(5), group_by ='organ', color_by = "age")
## Score connectivity between samples from the same organ
.calc_connectivity_score <- function(knn, o){
adj <- get.adjacency(knn)
n_org <- sum(V(knn)$organ==o)
n_other <- sum(V(knn)$organ!=o)
within_edges <- sum(adj[V(knn)$organ==o,V(knn)$organ==o])
between_edges <- sum(adj[V(knn)$organ==o,V(knn)$organ!=o])
score <- (within_edges/between_edges)*(n_other/n_org)
return(score)
}
## Calculate connectivity score for permutations of node labels
conn_score_test <- function(knn, o, n_perm=1000){
real_score <- .calc_connectivity_score(knn, o)
## Random permutations
rand_scores <- c()
for (i in 1:n_perm){
rand_knn <- knn
V(rand_knn)$organ <- sample(V(knn)$organ)
rand_scores <- c(rand_scores, .calc_connectivity_score(rand_knn, o))
}
p_val <- sum(c(rand_scores, real_score) >= real_score)/(n_perm + 1)
if (p_val < 2e-16){ p_val <- 2e-16}
return(c('score'=real_score,'p_value'=p_val))
}
## Calculate connectivity score + significance with permutation test
test_conn_group <- function(mofa_trained, g, k=5, min_R2 = 2, n_perm=1000){
knn <- get_ct_KNN_graph(mofa_trained, g, k=k, min_R2 = min_R2)
test_orgs <- names(table(V(knn)$organ))[table(V(knn)$organ) > 2]
return(sapply(test_orgs, function(o) conn_score_test(knn, o, n_perm=n_perm)))
}
connectivity_test_ls <- lapply(all_groups, function(g) test_conn_group(mofa_trained, g))
connectivity_test_ls <- setNames(connectivity_test_ls, all_groups)
connectivity_test_df <- imap(connectivity_test_ls, ~ data.frame(t(.x)) %>% rownames_to_column("organ") %>% mutate(group=.y)) %>%
purrr::reduce(bind_rows) %>%
mutate(is_signif = ifelse(p_value < 0.01, TRUE, FALSE))
connectivity_test_df %>%
ggplot(aes(organ, group,fill=log10(score))) +
geom_tile() +
scale_fill_distiller(palette="Reds", direction = 1) +
geom_text(data=. %>% filter(is_signif), label="*", size=5)
connectivity_test_df %>%
group_by(group) %>%
mutate(mean_val=median(score)) %>%
ungroup() %>%
arrange(-mean_val) %>%
mutate(group=factor(group, levels=unique(group))) %>%
ggplot(aes(organ, log1p(score))) +
geom_col(fill="grey") +
geom_col(data=. %>% filter(is_signif), aes(fill=organ)) +
scale_fill_manual(values=org_colors) +
coord_flip() +
facet_grid(group~.) +
theme(strip.text.y = element_text(angle=0))
get_top_weight_genes <- function(mofa_trained, f, n_top=20, which="top"){
w_df <- get_weights(mofa_trained, factors = f, as.data.frame = TRUE) %>%
arrange(value)
if (which=="top") {
w_df %>%
top_n(n_top, value) %>%
pull(feature) %>%
as.character()
} else if (which=="bottom"){
w_df %>%
top_n(n_top, -value) %>%
pull(feature) %>%
as.character()
}
}
plot_data_top_weights <- function(mofa_trained, ct, f, n_top=20, which="top"){
genes <- get_top_weight_genes(mofa_trained, f, which=which, n_top=n_top)
data <- get_data(mofa_trained, groups=ct)[[1]][[1]][genes,]
pl_df <- reshape2::melt(data, varnames=c("gene", "sample")) %>%
left_join(samples_metadata(mofa_trained)) %>%
arrange(age) %>%
mutate(sample=factor(sample, levels=unique(sample))) %>%
group_by(gene) %>%
mutate(value=scale(value))
pl_df %>%
ggplot(aes(sample, gene, fill=value)) +
geom_tile() +
facet_grid(.~organ, space="free", scales="free") +
scale_fill_gradient2(high="red", low="blue", name="Scaled\nexpression") +
xlab("----age--->") + ylab(glue("{which} weight genes")) +
theme_bw(base_size=16) +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank()) +
ggtitle(glue('{ct} - {f}'))
}
for (g in all_groups){
fs <- get_top_factor_per_celltype(mofa_trained, g, min_R2=3)
top_plots <- lapply(fs, function(x) (plot_data_top_weights(mofa_trained, g, x, which="top") + remove_x_axis()) /
plot_data_top_weights(mofa_trained, g, x, which="bottom") + ggtitle("")
)
full_pl <-wrap_plots(top_plots, ncol=1)
ggsave(glue("{figdir}/top_factors_expr_{g}.pdf"),plot=full_pl, width=12, height = 7*length(top_plots))
}
plot_data_heatmap(mofa_trained, factor = 5, groups = "MATURE_B", scale="row", annotation_samples = c("organ", "age"), features = 50)
plot_data_heatmap(mofa_trained, factor = 5, groups = "B1", scale="row", annotation_samples = c("organ", "age"), features = 50)
# BiocManager::install("MOFAdata")
library(MOFAdata)
utils::data(reactomeGS)
head(rownames(reactomeGS))
## Remove row with NA
reactomeGS <- reactomeGS[!is.na(rownames(reactomeGS)),]
library(EnsDb.Hsapiens.v86)
hg.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
all_genes <- ensembldb::genes(EnsDb.Hsapiens.v86)
detach(package:EnsDb.Hsapiens.v86)
detach(package:ensembldb)
# gene_name_2_id <- function(gene){
# return(all_genes[all_genes$gene_name==gene,]$gene_id[1])
# }
#
# gene_ids <- sapply(mofa_trained@features_metadata$feature, gene_name_2_id)
# rowData(sce)["gene_id"] <- gene_ids
# rowData(sce)["gene_name"] <- rownames(sce)
gene_names_reactome <- all_genes[colnames(reactomeGS)]$gene_name
colnames(reactomeGS) <- gene_names_reactome
Subset to genes tested
reactomeGS_universe <- reactomeGS[, colnames(reactomeGS) %in% mofa_trained@features_metadata$feature]
# GSEA on positive weights, with default options
res.positive <- run_enrichment(mofa_trained,
view='scaled_logcounts',
# statistical.test = 'cor.adj.parametric',
feature.sets = reactomeGS_universe,
sign = "positive",
)
# GSEA on negative weights, with default options
res.negative <- run_enrichment(mofa_trained,
view='scaled_logcounts',
# statistical.test = 'cor.adj.parametric',
feature.sets = reactomeGS_universe,
sign = "negative"
)
for (f in 1:mofa_trained@dimensions$K){
if (min(res.positive$pval.adj[,paste0("Factor", f)]) < 0.1) {
print(plot_enrichment(res.positive, factor = f, alpha=0.1) + ggtitle("Positive weights") +
plot_enrichment(res.negative, factor = f, alpha=0.1) + ggtitle("Negative weights") +
plot_annotation(title=paste0("Factor", f)))
}
}
signif_pathways <- rownames(data.frame(res.negative$pval.adj))[order(data.frame(res.negative$pval.adj)[["Factor8"]])[0:10]]
colnames(reactomeGS_universe)[reactomeGS_universe[signif_pathways[5],]==1]
plot_enrichment_detailed(res.negative, factor = 8)
–> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –> –>
–> –> –> –> –>
–> –> –>